Import packages and data

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
## Get the current data from the source files (two COVID-19 time series and one population table)


# Base location of the COVID-19 global time-series CSV files
# (CSSEGISandData/COVID-19 repository on GitHub).
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

# Confirmed cases first, deaths second: the resulting `urls` vector is
# addressed by position.
file_names <- c(
  "time_series_covid19_confirmed_global.csv",
  "time_series_covid19_deaths_global.csv"
)
urls <- str_c(url_in, file_names)

# Location of the population table used to normalise the counts.
url_in_pop <- "https://raw.githubusercontent.com/datasets/population-growth-estimates-and-projections/master/data/"
file_name_pop <- "population-constant-fertility.csv"
urls_pop <- str_c(url_in_pop, file_name_pop)

Let’s read in the data and see what we have.

# Download the two COVID-19 time series (confirmed cases, then deaths).
global_cases <- read_csv(urls[1])
global_deaths <- read_csv(urls[2])

# Download the population table and rescale the Population column.
# Entries that cannot be coerced to a number become NA (with a warning).
# NOTE(review): the factor of 1000 presumably converts from thousands of
# people to people -- confirm against the data source.
world_pop <- read_csv(urls_pop)
world_pop[["Population"]] <- as.double(world_pop[["Population"]]) * 1000
## Warning: 강제형변환에 의해 생성된 NA 입니다
# Reshape the wide confirmed-case table (one column per date) into long form
# and clean it in a single pipeline:
#   1. pivot every date column into (date, cases) rows,
#   2. drop the coordinates,
#   3. give the region columns syntactic names,
#   4. parse the month/day/year column names into Date values,
#   5. keep only rows with at least one case,
#   6. derive the calendar year (as text) and a second country column that
#      can be rewritten for joining without losing the original name.
global_cases_pivot <- global_cases %>%
  pivot_longer(
    cols = -c("Province/State", "Country/Region", Lat, Long),
    names_to = "date",
    values_to = "cases"
  ) %>%
  select(-c(Lat, Long)) %>%
  rename(
    Country_Region = `Country/Region`,
    Province_State = `Province/State`
  ) %>%
  mutate(date = mdy(date)) %>%
  filter(cases > 0) %>%
  mutate(
    Year = format(date, format = "%Y"),
    Country_Region2 = Country_Region
  )

# Same pipeline for the deaths table; the value column is `deaths`.
global_deaths_pivot <- global_deaths %>%
  pivot_longer(
    cols = -c("Province/State", "Country/Region", Lat, Long),
    names_to = "date",
    values_to = "deaths"
  ) %>%
  select(-c(Lat, Long)) %>%
  rename(
    Country_Region = `Country/Region`,
    Province_State = `Province/State`
  ) %>%
  mutate(date = mdy(date)) %>%
  filter(deaths > 0) %>%
  mutate(
    Year = format(date, format = "%Y"),
    Country_Region2 = Country_Region
  )

Amending global population data

Some country names differ between the COVID-19 data set and the population data set.
To merge the two, replace the country names in the COVID-19 data with the names used in the population data.

# Lookup table: country name as spelled in the COVID-19 data (names) ->
# country name as spelled in the population data (values).
#
# Fixes relative to the previous hand-written assignments:
#   * "Kosovo" is no longer mapped to "Republic of Korea"; that mapping gave
#     Kosovo the population of South Korea. Kosovo is left unmapped, so it
#     stays without a population match.
#   * "Congo (Kinshasa)" now maps to the Democratic Republic of the Congo
#     instead of sharing "Congo" with "Congo (Brazzaville)".
#     NOTE(review): confirm the exact spelling used in the population file.
# NOTE(review): Burma, Moldova, Tanzania and "West Bank and Gaza" may exist in
# the population data under other names -- verify before adding mappings.
country_name_map <- c(
  "Korea, North"        = "Dem. People's Republic of Korea",
  "Korea, South"        = "Republic of Korea",
  "Bolivia"             = "Bolivia (Plurinational State of)",
  "Brunei"              = "Brunei Darussalam",
  "Congo (Brazzaville)" = "Congo",
  "Congo (Kinshasa)"    = "Democratic Republic of the Congo",
  "Cote d'Ivoire"       = "Côte d'Ivoire",
  "Iran"                = "Iran (Islamic Republic of)",
  "Laos"                = "Lao People's Democratic Republic",
  "Russia"              = "Russian Federation",
  "Syria"               = "Syrian Arab Republic",
  "Taiwan*"             = "China, Taiwan Province of China",
  "US"                  = "United States of America",
  "Venezuela"           = "Venezuela (Bolivarian Republic of)",
  "Vietnam"             = "Viet Nam"
)

# Replace every value of `x` that appears in names(map) by its mapped value;
# all other values (including NA) are returned unchanged.
recode_country <- function(x, map = country_name_map) {
  needs_rename <- x %in% names(map)
  x[needs_rename] <- unname(map[x[needs_rename]])
  x
}

# Apply the same mapping to both tables so they stay consistent.
global_cases_pivot$Country_Region2 <-
  recode_country(global_cases_pivot$Country_Region2)
global_deaths_pivot$Country_Region2 <-
  recode_country(global_deaths_pivot$Country_Region2)

The following countries/regions have no population data:
* Antarctica * Burma * Diamond Princess * Kosovo * Moldova * MS Zaandam * Tanzania * West Bank and Gaza

# Attach the population to every COVID-19 row by (country, year).
# all.x = TRUE keeps every COVID-19 row; rows without a matching population
# entry get NA in the population columns.
global_cases_pivot <- merge(
  x = global_cases_pivot,
  y = world_pop,
  by.x = c("Country_Region2", "Year"),
  by.y = c("Region", "Year"),
  all.x = TRUE
)
global_deaths_pivot <- merge(
  x = global_deaths_pivot,
  y = world_pop,
  by.x = c("Country_Region2", "Year"),
  by.y = c("Region", "Year"),
  all.x = TRUE
)

# Discard the rows for which no population figure was found.
global_cases_pivot <- subset(global_cases_pivot, !is.na(Population))
global_deaths_pivot <- subset(global_deaths_pivot, !is.na(Population))

Population Bias

To find the hardest-hit and the safest countries in the world, we need to compare the numbers of cases and deaths.
However, countries differ in population, and a more populous country is likely to have more cases and deaths.
So comparing raw case and death counts is not meaningful.
Instead, calculate the average cases and deaths per thousand people for each country and compare those.

# Normalise the counts by population: cases / deaths per 1,000 inhabitants.
global_cases_pivot$case_per_thousand <-
  global_cases_pivot$cases / global_cases_pivot$Population * 1000
global_deaths_pivot$death_per_thousand <-
  global_deaths_pivot$deaths / global_deaths_pivot$Population * 1000

# Mean cases per thousand for each country. Countries whose mean is NA are
# dropped, and "Korea, North" is excluded from the case ranking.
average_cases <- global_cases_pivot %>%
  group_by(Country_Region) %>%
  summarise(
    average_case_per_thousand = mean(case_per_thousand),
    .groups = "drop"
  ) %>%
  filter(!is.na(average_case_per_thousand), Country_Region != "Korea, North")

# which.max()/which.min() pick exactly one row, so the result is always a
# single-row, single-column table even when several countries tie. The previous
# `== max(...)` comparison returned every tied row, which breaks code that
# expects one country name in `<result>[[1]]`.
worst_case_country <- average_cases[
  which.max(average_cases$average_case_per_thousand), "Country_Region"
]
best_case_country <- average_cases[
  which.min(average_cases$average_case_per_thousand), "Country_Region"
]

# Same ranking for deaths per thousand (no country is excluded here).
average_deaths <- global_deaths_pivot %>%
  group_by(Country_Region) %>%
  summarise(
    average_death_per_thousand = mean(death_per_thousand),
    .groups = "drop"
  ) %>%
  filter(!is.na(average_death_per_thousand))

worst_death_country <- average_deaths[
  which.max(average_deaths$average_death_per_thousand), "Country_Region"
]
best_death_country <- average_deaths[
  which.min(average_deaths$average_death_per_thousand), "Country_Region"
]

North Korea has the lowest average cases per thousand people.
However, this country does not share reliable information with the outside world,
so its data cannot be trusted and it is excluded from the case comparison.

Trend of the best and the worst case countries

Micronesia is the safest country: it has the lowest average cases per thousand people.
On the other hand, cases in Andorra have been growing continuously, and its average cases per thousand people is the highest.

# Time series for the country with the lowest average cases per thousand.
best_case_pivot <- global_cases_pivot[
  global_cases_pivot$Country_Region == best_case_country[[1]],
]
ggplot(best_case_pivot, aes(x = date, y = case_per_thousand)) +
  geom_area(fill = "seagreen", alpha = 0.4) +
  geom_line(color = "seagreen", size = 2) +
  geom_point(size = 3, color = "seagreen") +
  theme_ipsum() +
  labs(
    title = paste("The lowest case country - ", best_case_country[[1]]),
    x = "Date",
    y = "Cases per thousand ppl"
  )

# Time series for the country with the highest average cases per thousand.
worst_case_pivot <- global_cases_pivot[
  global_cases_pivot$Country_Region == worst_case_country[[1]],
]
ggplot(worst_case_pivot, aes(x = date, y = case_per_thousand)) +
  geom_area(fill = "violetred", alpha = 0.4) +
  geom_line(color = "violetred", size = 2) +
  geom_point(size = 3, color = "violetred") +
  theme_ipsum() +
  labs(
    title = paste("The highest case country - ", worst_case_country[[1]]),
    x = "Date",
    y = "Cases per thousand ppl"
  )

Trend of the best and the worst death countries

Although China is known as the origin of COVID-19, it has the lowest average deaths per thousand people.
It turns out that Peru is the hardest-hit country in the world by this measure.

# Time series for the country with the lowest average deaths per thousand.
best_death_pivot <- global_deaths_pivot[
  global_deaths_pivot$Country_Region == best_death_country[[1]],
]
ggplot(best_death_pivot, aes(x = date, y = death_per_thousand)) +
  geom_area(fill = "seagreen", alpha = 0.4) +
  geom_line(color = "seagreen", size = 2) +
  geom_point(size = 3, color = "seagreen") +
  theme_ipsum() +
  labs(
    title = paste("The lowest death country - ", best_death_country[[1]]),
    x = "Date",
    y = "Deaths per thousand ppl"
  )

# Time series for the country with the highest average deaths per thousand.
worst_death_pivot <- global_deaths_pivot[
  global_deaths_pivot$Country_Region == worst_death_country[[1]],
]
ggplot(worst_death_pivot, aes(x = date, y = death_per_thousand)) +
  geom_area(fill = "violetred", alpha = 0.4) +
  geom_line(color = "violetred", size = 2) +
  geom_point(size = 3, color = "violetred") +
  theme_ipsum() +
  labs(
    title = paste("The highest death country - ", worst_death_country[[1]]),
    x = "Date",
    y = "Deaths per thousand ppl"
  )

Predicting global case trend

Lastly, I tried to predict future global cases (July 2022 to December 2024) using only the Year and Month variables.
Unfortunately, the prediction shows an increasing pattern. It is unlikely to be accurate, since the model uses too few variables.
I hope this prediction is incorrect and that we can recover from COVID-19 soon.

# Two-digit month (as text) of every observation, stored on the case table.
global_cases_pivot$Month <- format(global_cases_pivot$date, format = "%m")

# Sum the `cases` column over all rows of each (Year, Month) pair, then turn
# both keys into numbers so they can serve as numeric predictors.
# NOTE(review): if `cases` is a cumulative count, summing the daily rows
# overstates the monthly totals -- confirm against the data source.
total_cases_pivot <- global_cases_pivot %>%
  group_by(Year, Month) %>%
  summarise(cases = sum(cases), .groups = "drop") %>%
  mutate(
    Year = as.numeric(Year),
    Month = as.numeric(Month)
  )

# Linear model of the monthly total on year and month.
mod <- lm(cases ~ Year + Month, data = total_cases_pivot)


# First day of the month that contains each date in `x`.
month1 <- function(x) {
  as.Date(cut(x, "month"))
}

# Forecast horizon: one entry per month from July 2022 to December 2024.
start_date <- as.Date("2022-07-01")
end_date <- as.Date("2024-12-01")
months <- seq(from = month1(start_date), to = month1(end_date), by = "month")

# Prediction frame: numeric Year / Month predictors plus a "YYYY_MM" label
# used for the x axis.
df <- data.frame(
  Year = as.numeric(format(months, "%Y")),
  Month = as.numeric(format(months, "%m")),
  Year_Month = format(months, "%Y_%m"),
  stringsAsFactors = FALSE
)

# Predicted monthly totals from the fitted linear model.
pred <- predict(mod, newdata = df)
df$future_cases <- pred

ggplot(df, aes(x = (Year_Month), y = future_cases)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Global cases prediction", y = "Cases")